#!/usr/bin/env python3

# Given a list of Graph Modeling Language (GML) files, assume that each node
# is a Wikipedia article in the language encoded in the filename. Read these
# data and create two files in $PWD:
#
#   wiki-graph.pkl.gz -- link graph extract created below
#   articles.txt -- sorted list of all article URLs found, suitable for tssearch

import os
import re
import sys

# Put ../../lib (relative to this script's directory) at the front of the
# module search path before importing u, so that directory is searched first.
# NOTE(review): presumably u is a project-local utility module that lives
# there -- confirm.
QUACLIB = os.path.abspath(os.path.dirname(__file__) + '/../../lib')
sys.path.insert(0, QUACLIB)
import u

# Matches a line ending in: label "<text>". Group 1 is the text between the
# quotes. The '$' anchor also matches just before a trailing newline, so lines
# read straight from a file need no stripping.
ART_RE = re.compile(r'label "(.+)"$')


def article_urls(lines, lang):
   """Yield '<lang>+<label>' for every line in lines that ends with a label.

   lines is any iterable of strings (e.g., an open text file); lines that do
   not match ART_RE are skipped silently. URLs are yielded in input order
   and may repeat if a label repeats."""
   for line in lines:
      m = ART_RE.search(line)
      if (m is not None):  # actually a label line
         yield '%s+%s' % (lang, m.group(1))


def read_graph(fn):
   """Read the GML file fn and return the tuple (root, urls).

   root is the basename of fn, and the language code is the part of root
   before the first '+'. urls is the set of article URLs built from the
   label lines found in the file; root itself is included only if the file
   contains a matching label."""
   root = os.path.basename(fn)
   lang = root.split('+')[0]
   # Decode explicitly instead of using the locale default, so the same input
   # parses the same way everywhere. UTF-8 is a superset of ASCII, and the
   # article list is written as ASCII anyway, so any non-ASCII label still
   # fails at write time exactly as it did before.
   with open(fn, encoding='utf-8') as fp:
      urls = set(article_urls(fp, lang))
   return (root, urls)


def format_article_list(urls):
   """Return urls sorted, one per line, with a trailing newline.

   An empty collection yields a single newline."""
   return '\n'.join(sorted(urls)) + '\n'


def main(filenames):
   """Build the link graph extract and the article list from GML files.

   For each file, map its root (the file's basename) to the set holding the
   root plus every article URL found in that file. Then pass the mapping to
   u.pickle_dump() under the name 'wiki-graph' and write the sorted union of
   all found URLs to articles.txt in the current directory.

   If two files share a basename, the later one replaces the earlier one in
   the mapping, but the URLs of both still appear in articles.txt."""
   artmap = dict()
   arts_all = set()
   for fn in filenames:
      (root, urls) = read_graph(fn)
      # The root is always a member of its own set, whether or not the file
      # contains a label for it.
      artmap[root] = { root } | urls
      arts_all |= urls
   u.pickle_dump('wiki-graph', artmap)
   with open('articles.txt', 'wt', encoding='ascii') as fp:
      fp.write(format_article_list(arts_all))


if (__name__ == '__main__'):
   main(sys.argv[1:])
